# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3

import json

import datasets
from dataclasses import dataclass


# Citation text handed to ``datasets.DatasetInfo``; no citation has been filled in yet.
_CITATION = '''
'''

# Language code -> number of shard files available for that language.
# Insertion order matters: it determines the order of the builder configs.
languages2filesize = dict(
    ar=32,
    de=150,
    en=352,
    es=102,
    fr=134,
    hi=5,
    it=83,
    ja=47,
    ko=13,
    simple=5,
    zh=23,
)

_DESCRIPTION = 'dataset load script'

# Shards are numbered from 0 and zero-padded to three digits (000.jsonl.gz, 001.jsonl.gz, ...).
_SHARD_URL_TEMPLATE = (
    'https://huggingface.co/datasets/Cohere/wikipedia-22-12/resolve/main/{lang}/{shard:03d}.jsonl.gz'
)

# Language code -> ordered list of every shard URL for that language.
_DATASET_URLS = {
    lang: [
        _SHARD_URL_TEMPLATE.format(lang=lang, shard=shard)
        for shard in range(shard_count)
    ]
    for lang, shard_count in languages2filesize.items()
}


class WikiCorpus(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads the per-language JSON Lines shards listed in ``_DATASET_URLS``.

    One builder config is created per key of ``languages2filesize``; the config
    name is the language code and selects which shard URLs are downloaded.
    """

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            version=datasets.Version('1.0.0'),
            name=lang,
            description=f'Wiki dataset in language {lang}.',
        )
        for lang in languages2filesize
    ]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the columns and metadata."""
        features = datasets.Features({
            'id': datasets.Value('int32'),
            'title': datasets.Value('string'),
            'text': datasets.Value('string'),
            'url': datasets.Value('string'),
            'wiki_id': datasets.Value('int32'),
            'views': datasets.Value('float32'),
            'paragraph_id': datasets.Value('int32'),
            'langs': datasets.Value('int32'),
            # NOTE: an 'emb' column is currently disabled in this schema:
            # 'emb': datasets.Sequence(datasets.Value("float32"))
        })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # Column names and types; the same schema is used for every language config.
            features=features,
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage='https://www.cohere.ai',
            # License for the dataset if available
            license='',
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the shards of the selected language and expose them as one ``train`` split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download_and_extract`` is called with the list of shard URLs.

        Returns:
            A single-element list holding the ``train`` ``SplitGenerator``, whose
            ``filepaths`` kwarg is forwarded to ``_generate_examples``.
        """
        # The config name doubles as the language code (see BUILDER_CONFIGS).
        lang = self.config.name
        downloaded_files = dl_manager.download_and_extract(_DATASET_URLS[lang])

        splits = [
            datasets.SplitGenerator(
                name='train',
                gen_kwargs={
                    'filepaths': downloaded_files,
                },
            ),
        ]
        return splits

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs from the extracted JSON Lines files.

        Args:
            filepaths: Iterable of paths to UTF-8 text files holding one JSON
                object per line.

        Yields:
            Tuples of the record's ``'id'`` value and the decoded record itself.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record has no ``'id'`` field.
        """
        for filepath in filepaths:
            with open(filepath, encoding="utf-8") as f:
                for line in f:
                    # A blank line (e.g. a trailing empty line at the end of a
                    # shard) is not a JSON document; skip it rather than abort
                    # the whole generation with a JSONDecodeError.
                    if not line.strip():
                        continue
                    data = json.loads(line)
                    yield data['id'], data
